Main Figures
Have a legend for all plots
Since we have set a parameter in our set_colors.R file that we source at the beginning and use this throughout, we can assure that the below legends will represent the legends called afterwards.
####### Legend for lakesite
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = season_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank())
# Extract the legend
season_legend <- get_legend(legend_plot)
####### Legend for lakesite
lakesite_legend <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = lakesite)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = lakesite_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank())
# Extract the legend
lakesite_legend <- get_legend(lakesite_legend)
Figure 1
######################################################### Fraction ABUNDANCe
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(as.numeric(fraction_bac_abund)))
## # A tibble: 2 x 2
## fraction `mean(as.numeric(fraction_bac_abund))`
## <fctr> <dbl>
## 1 Particle 41168.88
## 2 Free 734522.25
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree
poster_a <-
ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
ylab("Log10(Bacterial Counts) \n (cells/mL)") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
##### Particle vs free cell abundances
geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
theme(legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox
##
## Wilcoxon rank sum test
##
## data: frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(frac_bacprod))
## # A tibble: 2 x 2
## fraction `mean(frac_bacprod)`
## <fctr> <dbl>
## 1 Particle 9.958333
## 2 Free 24.058333
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # WholePart vs WholeFree
poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
ylab("Community Production \n (μgC/L/day)") +
##### Particle vs free bulk production
geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= paste("*")) +
annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
group_by(fraction) %>%
summarize(mean(fracprod_per_cell))
## # A tibble: 2 x 2
## fraction `mean(fracprod_per_cell)`
## <fctr> <dbl>
## 1 Particle 4.816116e-07
## 2 Free 3.866798e-08
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree
poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(fracprod_per_cell), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylim(c(-8.5, -4.9)) +
ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
##### Particle vs free per-cell production
geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=-5.15, size = 3.5, color = "gray40",
label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######## FIGURE 1
# legend
legend <- get_legend(poster_a)
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1)
#fig_1 <-
plot_grid(row1_plots, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Linear Regressions with Fraction Diversity
# Free-Living samples only
div_measures_free <- otu_alphadiv %>%
dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Free") %>%
dplyr::select(-fraction)
# Particle-associated samples only
div_measures_part <- otu_alphadiv %>%
dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Particle") %>%
dplyr::select(-fraction)
## All of the samples!
div_measures_all <- otu_alphadiv %>%
dplyr::select(mean, measure, frac_bacprod, fracprod_per_cell_noinf)
########################################################
############## Particle: Community-wide
lm_divs_particle_comm <- div_measures_part %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "Particle", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
############## Particle: Community-wide
lm_divs_particle_percap <- div_measures_part %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
########################################################
############## Free: Community-wide
lm_divs_free_comm <- div_measures_free %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "Free", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
############## Free: Community-wide
lm_divs_free_percap <- div_measures_free %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
########################################################
############## All Samples: Community-wide
lm_divs_all_comm <- div_measures_all %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "All Samples", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
############## All Samples: Community-wide
lm_divs_all_percap <- div_measures_all %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "All Samples", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value)
## Filtered PCA Linear Model Results
# Put all of the PCA and environmental linear model results together into one dataframe
lm_div_results <-
bind_rows(lm_divs_particle_comm, lm_divs_particle_percap,
lm_divs_free_comm, lm_divs_free_percap,
lm_divs_all_comm, lm_divs_all_percap) %>%
dplyr::rename(independent_var = measure,
dependent_var = dependent)
See supplemental table 1 header for the table output!
Cross Validation
Cross validation will provide a better estimate for the adjusted R-squared value of our diversity regressions.
#################################### Community-Wide Production ####################################
################### Richness ###################
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"))
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_rich_PA <- train(
frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_rich_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.725198 0.6529792 5.428276 2.311594 0.2866787 1.946107
################### Shannon Entropy ###################
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_prod_vs_shannon_PA <- train(
frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_shannon_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.518007 0.6893392 5.165152 2.500648 0.2708935 2.114215
################### Inverse Simpson ###################
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"))
cv_lm_prod_vs_invsimps_PA <- train(
frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_invsimps_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 5.223794 0.7684528 3.977896 2.169656 0.2517694 1.829427
################### Simpson's Evenness ###################
lm_prod_vs_simpseven_PA <- lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_prod_vs_simpseven_PA <- train(
frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_simpseven_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.209806 0.6543889 4.794798 3.0062 0.3012353 2.279525
#################################### Per-Capita Production ####################################
################### Richness ###################
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle"))
cv_lm_percell_prod_vs_rich_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_rich_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4212636 0.6187559 0.337734 0.1590698 0.3371661 0.1396469
################### Shannon Entropy ###################
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_percell_prod_vs_shannon_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_shannon_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.413872 0.6936693 0.3342346 0.1426615 0.3042119 0.1235188
################### Inverse Simpson ###################
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle"))
cv_lm_percell_prod_vs_invsimps_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_invsimps_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.3571974 0.7204792 0.2890154 0.1120944 0.3183298 0.09172199
################### Simpson's Evenness ###################
lm_percell_prod_vs_simpseven_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_percell_prod_vs_simpseven_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_simpseven_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4322433 0.6583372 0.3502716 0.1437934 0.3106654 0.1156553
Table S3
########## PUT A TABLE TOGETHER
# Per Liter Production
perliter_row1 <- c("Richness", "Community-Wide",
round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_rich_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_rich_PA$results$RsquaredSD, digits = 3))
perliter_row2 <- c("Shannon_Entropy", "Community-Wide",
round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_shannon_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))
perliter_row3 <- c("Inverse_Simpson", "Community-Wide",
round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_invsimps_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))
perliter_row4 <- c("Simpsons_Evenness","Community-Wide",
round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_simpseven_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))
# Per cell production
percell_row1 <- c("Richness", "Per-Capita",
round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_rich_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_rich_PA$results$RsquaredSD, digits = 3))
percell_row2 <- c("Shannon Entropy", "Per-Capita",
round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_shannon_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))
percell_row3 <- c("Inverse Simpson", "Per-Capita",
round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_invsimps_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))
percell_row4 <- c("Simpsons Evenness", "Per-Capita",
round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_simpseven_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))
r2_table <- as.data.frame(rbind(perliter_row1, perliter_row2, perliter_row3, perliter_row4,
percell_row1, percell_row2, percell_row3, percell_row4))
colnames(r2_table) <- c("Diversity_Metric", "Production_Measure","Adjusted_R2","CV_R2", "CV_R2_SD")
row.names(r2_table) = NULL
datatable(r2_table,
caption = "Table S3: \n R2 estimates for heterotrophic production vs particle-associated diversity linear regressions.",
options = list(pageLength = 50), rownames = FALSE)
Figure 2
######################################################### OBSERVED RICHNESS #########################################################
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
rich_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 129, p-value = 0.0004955
## alternative hypothesis: true location shift is not equal to 0
filter(richness) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
## fraction `mean(mean)` `sd(mean)`
## <fctr> <dbl> <dbl>
## 1 Particle 556.7992 167.31404
## 2 Free 338.4242 85.98665
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree
rich_distribution_plot <-
ggplot(richness, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
xlab("Observed Richness") + xlab("Fraction") +
scale_shape_manual(values = season_shapes) +
geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=980, size = 8, color = "#424645", label= paste("***"), angle = 90) +
annotate("text", x=1.5, y=790, size = 4, color = "#424645",
label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Richness vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Community-wide (Per-Liter) Production
prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=45, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
################ Richness vs Per Capita (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Per Capita (Per-Cell) Production
percell_prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
######################################################### INVERSE SIMPSON #########################################################
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
invsimps_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 81, p-value = 0.6297
## alternative hypothesis: true location shift is not equal to 0
filter(invsimps) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
## fraction `mean(mean)` `sd(mean)`
## <fctr> <dbl> <dbl>
## 1 Particle 35.47659 23.843325
## 2 Free 24.09219 8.136901
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree
invsimps_distribution_plot <-
ggplot(invsimps, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
ylab("Inverse Simpson") + xlab("Fraction") +
geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
theme(legend.position = "none", #axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Inverse Simpson vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production
prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("Community Production \n (μgC/L/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
################ Inverse Simpson vs Per Capitra (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Per Capitra (Per-Cell) Production
percell_prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf), fill = fraction, color = fraction)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(shape = season)) +
scale_color_manual(values = fraction_colors, guide = TRUE) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylab("log10(production/cell)\n (μgC/cell/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
guides(color = guide_legend(override.aes = list(shape = 15))) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1.1),
align = "v")
# All 3 richness plots together
rich_plots <- plot_grid(rich_distribution_plot, prod_vs_rich_plot,
percell_prod_vs_rich_plot + theme(legend.position = "none"),
labels = c("A", "C", "E"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
invsimps_plots_noyaxis <-
plot_grid(invsimps_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()),
prod_vs_invsimps_plot + theme(axis.title.y = element_blank()),
percell_prod_vs_invsimps_plot + theme(axis.title.y = element_blank(), legend.position = "none"),
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
# Extract distribution plots of richness and inverse simpson
figure2_row1 <- plot_grid(rich_plots, invsimps_plots_noyaxis,
ncol = 2, nrow = 1, rel_widths = c(1, 0.75),
align = "h")
######## PLOT FIGURE 2
plot_grid(figure2_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Figure 3
Unweighted ses.mpd
######################################################### SES MPD DISTRIBUTION: UNWEIGHTED
################ Figure 3A: Distribution of Particle and Free Unweighted Mean Pairwise distance
## 1. Is there a difference between particle and free Unweighted Mean Pairwise distance?
unweighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = unweighted_df)
unweighted_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mpd.obs.z by fraction
## W = 30, p-value = 0.01449
## alternative hypothesis: true location shift is not equal to 0
# 2. What are the means of particle and free unweighted PD?
unweighted_df %>%
group_by(fraction) %>%
summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
## fraction `mean(mpd.obs.z)`
## <fctr> <dbl>
## 1 Particle -0.4971164
## 2 Free 0.4375532
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
# Make a data frame to draw significance line in boxplot (visually calculated)
dat4 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree
unweight_distribution_plot <-
ggplot(unweighted_df, aes(y = mpd.obs.z, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
xlab("Fraction") +
geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
##### Particle vs free per-cell production
geom_path(data = dat4, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=1.35, size = 4, color = "#424645",
label= paste("p =", round(unweighted_fraction_wilcox$p.value, digits = 2))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Figure 3B: Unweighted Phylogenetic Diversity vs Richness
## 1. Is there a relationship between richness and Unweighted Mean Pairwise distance?
lm_richness_vs_unweight <- lm(mean ~ mpd.obs.z, data = unweighted_df)
# Linear model results for particle-associated only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.79 -112.16 -41.89 55.90 278.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 515.15 51.16 10.069 1.49e-06 ***
## mpd.obs.z -83.78 49.91 -1.679 0.124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 155 on 10 degrees of freedom
## Multiple R-squared: 0.2199, Adjusted R-squared: 0.1419
## F-statistic: 2.818 on 1 and 10 DF, p-value: 0.1241
# Linear model results for free-living only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.15 -66.22 -18.70 43.65 172.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 337.088 42.753 7.884 1.34e-05 ***
## mpd.obs.z 3.053 77.510 0.039 0.969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.18 on 10 degrees of freedom
## Multiple R-squared: 0.0001551, Adjusted R-squared: -0.09983
## F-statistic: 0.001551 on 1 and 10 DF, p-value: 0.9694
## 2. Extract the R2 and p-value from the linear model:
lm_lab_richness_vs_unweight<- paste("atop(R^2 ==", round(summary(lm_richness_vs_unweight)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_richness_vs_unweight)$coefficients[,4][2]), digits = 3), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
unweight_rich_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = mean, x = mpd.obs.z)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) + ylab("\n Observed Richness") +
scale_shape_manual(values = season_shapes) +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_fill_manual(values = fraction_colors) +
geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=750, label=lm_lab_richness_vs_unweight, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3C: Unweighted Mean Pairwise distance vs Community-Wide Production
# Is there a relationship between Production and Unweighted Mean Pairwise distance?
summary(lm(frac_bacprod ~ mpd.obs.z, data = unweighted_df)) # NS
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = unweighted_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.416 -9.547 -3.120 8.193 44.030
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.125 3.102 5.520 1.51e-05 ***
## mpd.obs.z 3.901 3.768 1.035 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.19 on 22 degrees of freedom
## Multiple R-squared: 0.04645, Adjusted R-squared: 0.003106
## F-statistic: 1.072 on 1 and 22 DF, p-value: 0.3118
# Linear model results for particle-associated only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.145 -4.312 -1.824 3.402 19.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.213 2.617 3.138 0.0105 *
## mpd.obs.z -3.511 2.553 -1.376 0.1990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.928 on 10 degrees of freedom
## Multiple R-squared: 0.1591, Adjusted R-squared: 0.07501
## F-statistic: 1.892 on 1 and 10 DF, p-value: 0.199
# Linear model results for free-living only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.996 -14.939 2.192 8.751 37.004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.191 8.398 2.166 0.0555 .
## mpd.obs.z 13.410 15.225 0.881 0.3991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.71 on 10 degrees of freedom
## Multiple R-squared: 0.072, Adjusted R-squared: -0.0208
## F-statistic: 0.7758 on 1 and 10 DF, p-value: 0.3991
unweight_prod_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(shape = season)) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_fill_manual(values = fraction_colors) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3D: Unweighted Phylogenetic Diversity vs Per Capita (Per-Cell) Production
## 1. Run the linear model: Is there a relationship between PER-CELL PRODUCTION and Unweighted Mean Pairwise distance?
unweight_vs_percell <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
# Linear model results for particle-associated only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3795 -0.2419 -0.1651 0.0443 1.1815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.9248 0.1711 -40.475 1.71e-11 ***
## mpd.obs.z -0.3299 0.1627 -2.028 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4649 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.2373
## F-statistic: 4.112 on 1 and 9 DF, p-value: 0.07319
# Linear model results for free-living only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63978 -0.29782 0.07694 0.17610 0.76503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.55622 0.19603 -38.545 3.3e-12 ***
## mpd.obs.z -0.04303 0.35540 -0.121 0.906
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4135 on 10 degrees of freedom
## Multiple R-squared: 0.001463, Adjusted R-squared: -0.09839
## F-statistic: 0.01466 on 1 and 10 DF, p-value: 0.906
## 2. Extract the R2 and p-value from the linear model:
lm_lab_percell_unweightPD <- paste("atop(R^2 ==", round(summary(unweight_vs_percell)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(unweight_vs_percell)$coefficients[,4][2]), digits = 4), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Per Capitra (Per-Cell) Production
unweight_percell_vs_mpd_plot <-
ggplot(unweighted_df,
aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z)) +
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Unweighted Phylogenetic Diversity") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_smooth(method = "lm", color="#424645", fill = "#424645", alpha = 0.3) +
scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=-6, label=lm_lab_percell_unweightPD, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
######## FIGURE 3
figure3_row1 <- plot_grid(unweight_distribution_plot, unweight_rich_vs_mpd_plot, unweight_prod_vs_mpd_plot,
unweight_percell_vs_mpd_plot + theme(legend.position = "none"),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
rel_heights = c(0.5, 1, 1, 1.2),
align = "v")
plot_grid(figure3_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Lasso Regressions
Prepare the data for Lasso Regressions
unweight_vals <- unweighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Unweighted_PD = mpd.obs.z)
weighted_vals <- weighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Weighted_PD = mpd.obs.z)
wide_all_divs <- otu_alphadiv %>%
dplyr::select(norep_filter_name ,mean, measure) %>%
spread(measure, mean)
lasso_data_df <- metadata_pca %>%
left_join(wide_all_divs, by = "norep_filter_name") %>%
left_join(unweight_vals, by = "norep_filter_name") %>%
left_join(weighted_vals, by = "norep_filter_name") %>%
dplyr::select(-c(project, year, Date, limnion, norep_water_name, dnaconcrep1, perc_attached_bacprod,
SD_perc_attached_bacprod, SE_total_bac_abund, SE_perc_attached_abund, SE_attached_bac))
## Warning: Column `norep_filter_name` joining character vector and factor, coercing into character vector
### PARTICLE DATA
lasso_data_df_particle <- lasso_data_df %>%
filter(fraction == "Particle" ) %>%
dplyr::select(-fraction)
lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station))
percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
### FREE DATA
lasso_data_df_free <- lasso_data_df %>%
filter(fraction == "Free" ) %>%
dplyr::select(-fraction)
lasso_data_df_free_noprod <- lasso_data_df_free %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station))
percell_lasso_data_df_free_noprod <- lasso_data_df_free %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
## ALL DATA
all_dat_lasso_comm <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station, fraction))
all_dat_lasso_percapita <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, fraction,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
Perform Lasso regression
Bulk Community Production: Particle
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data <-
lasso_data_df_particle_noprod %>% # Use data only for the particle samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.375897
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.478053e-17
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 1.300983e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.478053e-17
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 1.300983e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
The lasso model uses Inverse Simpson as the best and only predictor of production!
Community-Wide Production: Free
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_free <-
lasso_data_df_free_noprod %>% # Use data only for the free-living samples
scale() %>% # Scale all of the variables to have mean = 0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_free)[,-1]
y <- scaled_comm_data_free$frac_bacprod
grid <- 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train_free_comm <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train_free_comm)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train_free_comm, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.659002
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.028226e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -5.122124e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL 2.183338e-02
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 2.171826e-01
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD 1.397650e-01
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.292254e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.218982e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 2.514746e-02
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
The lasso model for predicting the community-wide heterotrophic bacterial production for the free-living habitat uses pH and PC5 as the best and only two predictors!
Per-capita Production: Particle
scaled_percapita_data <- percell_lasso_data_df_particle_noprod %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf)) %>%
scale() %>%
as.data.frame()
set.seed(777)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data)[,-1]
y = scaled_percapita_data$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.743068
## Run lasso on the entire dataset with the best lasso
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.322846e-16
## Sample_depth_m .
## Temp_C -1.371144e-01
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund -5.741093e-03
## attached_bac -3.957476e-02
## perc_attached_abund .
## fraction_bac_abund -1.164412e-16
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 1.240751e-02
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 4.388114e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lasso
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.173111e-16
## Sample_depth_m .
## Temp_C -8.717767e-02
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 3.530134e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
Temperature and Inverse Simpson’s Index are the two best predictors of particle-associated per-capita production!
Per-capita Production: Free
scaled_percapita_data_free <- percell_lasso_data_df_free_noprod %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf)) %>%
scale() %>%
as.data.frame()
set.seed(777)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data_free)[,-1]
y = scaled_percapita_data_free$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.159787
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.365298e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -1.017130e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.365298e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -1.017130e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
pH is the only predictors of free-living per-capita production!
Community-wide Production: All Samples
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_ALL <-
all_dat_lasso_comm %>% # Use community-wide production data only for the all samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_ALL)[,-1]
y <- scaled_comm_data_ALL$frac_bacprod
grid <- 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
best_lasso_lambda
## [1] 0.3946859
lasso_lambda_1se
## [1] 0.7569721
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 0.5662862
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.328268e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.134918e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund 4.841677e-02
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.081668e-17
## Sample_depth_m 0.000000e+00
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
The “one-standard-error” approach to lasso is not effective in the above approach and therefore, we will use the lambda.min.
Per-capita Production: All Samples
set.seed(777)
scaled_percapita_ALL <- all_dat_lasso_percapita %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf, perc_attached_abund, fraction_bac_abund, attached_bac)) %>%
scale() %>%
as.data.frame()
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_ALL)[,-1]
y = scaled_percapita_ALL$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## Warning in lasso_divs_pred - y_test: longer object length is not a multiple of shorter object length
## Error in mean((lasso_divs_pred - y_test)^2): dims [product 11] do not match the length of object [12]
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.520111e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -6.755466e-02
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness 4.154369e-01
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD -6.310163e-02
## Weighted_PD .
## Run lasso on the entire dataset
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.854064e-18
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness 9.842520e-02
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .